import sys
import jieba


def parse_category_fields(fields):
    """Turn one split row of the category CSV into a lookup entry.

    Args:
        fields: The comma-separated columns of one row. Column 1 is read as
            cat1, column 2 as cat2 and column 3 as the target category;
            column 0 is not used.

    Returns:
        A ``(key, target_category)`` tuple where ``key`` is
        ``cat1 + '\\t' + cat2`` (all parts whitespace-stripped), or ``None``
        when the row has fewer than four columns.
    """
    # Column 3 is read below, so at least four columns are required.
    if len(fields) < 4:
        return None

    cat1 = fields[1].strip()
    cat2 = fields[2].strip()
    yz_cat = fields[3].strip()
    return cat1 + '\t' + cat2, yz_cat


def format_training_line(line, cat_dict, tokenize):
    """Convert one raw training line into an output line.

    Args:
        line: A tab-separated line whose first two columns are the category
            pair and whose third column is the text to tokenize.
        cat_dict: Mapping from ``cat1 + '\\t' + cat2`` to the target category.
        tokenize: Callable that splits the text column into a list of tokens.

    Returns:
        ``target_category + '\\t' + space-joined tokens + '\\n'``, or ``None``
        when the line has fewer than three columns or its category pair has
        no (non-empty) mapping in ``cat_dict``.
    """
    fields = line.strip().split('\t')
    # Blank or truncated lines cannot supply both the key and the text.
    if len(fields) < 3:
        return None

    cat = cat_dict.get(fields[0].strip() + '\t' + fields[1].strip())
    if not cat:
        return None

    return cat + '\t' + ' '.join(tokenize(fields[2])) + '\n'


if __name__ == '__main__':
    cat12_file = '../../../resource/cat_12.csv'

    # cat1, cat2, kw, cat3
    # NOTE(review): the comment above presumably lists the training file's
    # columns; only the first three are read by this script -- confirm.
    train_file = '/Users/hardy/data/train20_cat12.tsv'
    out_file = '/Users/hardy/data/train20_cat_yz.tsv'
    cat_dict = dict()

    # NOTE(review): all files are opened with the platform default encoding;
    # pass an explicit encoding if the inputs are known to be UTF-8.
    with open(cat12_file) as cat_fd:
        for row in cat_fd:
            fields = row.strip().split(',')
            print(fields)
            entry = parse_category_fields(fields)
            if entry is None:
                continue

            key, yz_cat = entry
            cat_dict[key] = yz_cat

    print(cat_dict)
    # The output file is opened first, as before, so it is created/truncated
    # even when the training file cannot be opened.
    with open(out_file, 'w') as fd, open(train_file) as train_fd:
        for raw in train_fd:
            out_line = format_training_line(raw, cat_dict, jieba.lcut)
            if out_line is None:
                continue

            fd.write(out_line)

